Completed
Push — develop ( 322e3f...d82e9f )
by Dylan
02:40
created

crawler.BEFORE_INIT   A

Complexity

Conditions 3
Paths 3

Size

Total Lines 9

Duplication

Lines 0
Ratio 0 %

Importance

Changes 1
Bugs 0 Features 0
Metric Value
cc 3
c 1
b 0
f 0
nc 3
nop 0
dl 0
loc 9
rs 9.6666
1
/**
 * Built-in SEO tests for the crawler.
 *
 * `tests` lists every test to register. The function properties whose key
 * matches a test name are the per-page callbacks; the remaining functions are
 * shared helpers. Callbacks pass `this.name` to the painter, so they rely on
 * being invoked with a receiver exposing the test name
 * (presumably the registered test object - confirm against the crawler core).
 *
 * `field_data` is indexed positionally by the callbacks below: [1] is read as
 * the field labels, [2] as the per-field HTML and [3] is handed to
 * `crawler.get_word_count` (assumed to be the article text - TODO confirm).
 */
const default_tests = {

    // [name, title, headers, type] - `type` is optional; the registration
    // code falls back to 'default' when it is missing or empty.
    tests: [
        ['bad_links', 'BAD LINKS', ['URL', 'Linked From'], ''],
        ['h1_info', 'H1 INFO', ['URL', 'Count', 'Text', 'Status']],
        ['h2_info', 'H2 INFO', ['URL', 'Count', 'Text', 'Status']],
        ['word_count', 'WORD COUNT', ['URL', 'Word Count', 'Article Word Count'], 'info'],
        ['int_link_info', 'INTERNAL LINKS',
            ['URL', 'Article Links', 'Article Link Count', 'Article Density', 'Total Link Count', 'Total Density', 'Status'],
            'info'],
        ['ext_link_info', 'EXTERNAL LINKS', ['URL', 'External Link Count', 'External Links'], 'success'],
        ['img_info', 'IMAGES', ['URL', 'Count', 'Missing Alt Tag', 'Missing Title Tag', 'Fields Missing Images', 'Status'], 'success'],
        ['title_info', 'META TITLE', ['URL', 'Meta Title', 'Length', 'Status']],
        ['description_info', 'META DESCRIPTION', ['URL', 'Meta Description', 'Length', 'Status']],
        ['canonical_info', 'CANONICAL', ['URL', 'Status'], 'success'],
        ['noindex_pages', 'NO-INDEX PAGES', ['URL'], 'success'],
        ['urls_test', 'URL STRUCTURE', ['URL', 'Status'], 'success'],
        ['duplicate_meta_tags', 'DUPLICATE META TAGS', ['URL', 'Status']],
        ['href_langs', 'LANG TAGS', ['URL', 'Tags'], 'info'],
        ['orphan_pages', 'ORPHAN PAGES', ['URL']],
        ['redirect_links', 'REDIRECT LINKS', ['Link', 'In', 'Redirects To']],
    ],

    /**
     * Report the H1 tags of the page: exactly one is OK, none or several is an error.
     *
     * @param {string} url
     * @param {jQuery} html
     * @returns {*} whatever crawler.painter.add_row returns
     */
    h1_info: function(url, html){
        const h1     = html.find('h1'),
              link   = crawler.painter.create_link(url, url),
              joined = [];
        let status;

        h1.each(function(){ joined.push(this.innerHTML); });

        if(h1.length !== 1){
            status = crawler.painter.create_status('error', (h1.length < 1) ? 'Missing H1' : 'Multiple H1 tags');
        }else{
            status = crawler.painter.create_status('success', 'OK!');
        }

        return crawler.painter.add_row(this.name, [link, h1.length, joined.join(', '), status]);
    },

    /**
     * Report the H2 tags of the page: a page without any H2 gets a warning.
     *
     * @param {string} url
     * @param {jQuery} html
     * @returns {*} whatever crawler.painter.add_row returns
     */
    h2_info: function(url, html){
        const h2     = html.find('h2'),
              link   = crawler.painter.create_link(url, url),
              joined = [];
        let status;

        h2.each(function(){ joined.push(this.innerHTML); });

        if(h2.length < 1){
            status = crawler.painter.create_status('warning', 'Missing H2');
        }else{
            status = crawler.painter.create_status('success', 'OK!');
        }

        return crawler.painter.add_row(this.name, [link, h2.length, joined.join(', '), status]);
    },

    /**
     * Report the word count of the whole page (`phrases`) and of `field_data[3]`.
     *
     * @param {string} url
     * @param {jQuery} html unused
     * @param {string} headers unused
     * @param {Array} field_data
     * @param {Array} phrases
     * @returns {*} whatever crawler.painter.add_row returns
     */
    word_count: function(url, html, headers, field_data, phrases){
        const link       = crawler.painter.create_link(url, url),
              word_count = crawler.get_word_count(phrases),
              art_count  = crawler.get_word_count(field_data[3]);

        return crawler.painter.add_row(this.name, [link, word_count, art_count]);
    },

    /**
     * Report the internal links of the page and the words-per-link density,
     * both for the fields in `field_data[2]` and for the full page.
     * A row is only added when the page contains at least one internal link.
     *
     * @param {string} url
     * @param {jQuery} html
     * @param {string} headers unused
     * @param {Array} field_data
     * @param {Array} phrases
     * @returns {undefined}
     */
    int_link_info: function(url, html, headers, field_data, phrases){
        const link      = crawler.painter.create_link(url, url),
              art_links = [],
              links     = [];

        // Pushes onto `target` every href that is neither external nor an anchor of this page
        const collect = function(anchors, target){
            $.each(anchors, function(){
                const href = $(this).attr('href');
                if(href && !crawler.is_external(href) && !crawler.is_anchor(href, url)) target.push(href);
            });
        };

        // Article links
        for(const field in field_data[2]){
            collect($(field_data[2][field]).find('a'), art_links);
        }

        // Full page links
        collect(html.find('a'), links);

        // `false` marks "no links"; compare strictly so that a genuine density
        // of 0 (links present but no words) is not mistaken for it.
        const art_word_count = crawler.get_word_count(field_data[3]),
              art_density    = (art_links.length > 0) ? art_word_count / art_links.length : false,
              art_dens_text  = (art_density !== false) ? art_density.toFixed(2) + ' words/link' : 'No internal links',
              word_count     = crawler.get_word_count(phrases),
              density        = (links.length > 0) ? word_count / links.length : false,
              dens_text      = (density !== false) ? density.toFixed(2) + ' words/link' : 'No internal links';

        // Fewer than 100 words per article link is flagged
        const status = (art_density !== false && art_density < 100)
            ? crawler.painter.create_status('warning', 'This page might be considered spammy')
            : crawler.painter.create_status('success', 'OK!');

        if(links.length > 0){
            crawler.painter.add_row(this.name, [
                link, art_links.join('<br />'), art_links.length, art_dens_text, links.length, dens_text, status
            ]);
        }

        return undefined;
    },

    /**
     * Report the external links found inside the fields of `field_data[2]`.
     * Links without rel="nofollow" are shown as warnings, the others as info.
     * A row is only added when at least one external link exists.
     *
     * @param {string} url
     * @param {jQuery} html unused
     * @param {string} headers unused
     * @param {Array} field_data
     * @returns {undefined}
     */
    ext_link_info: function(url, html, headers, field_data){
        const link  = crawler.painter.create_link(url, url),
              links = [];

        for(const field in field_data[2]){
            $.each($(field_data[2][field]).find('a'), function(){
                const $this = $(this),
                      href  = $this.attr('href');

                if(href && crawler.is_external(href)){
                    const rel  = $this.attr('rel'),
                          type = (!rel || rel.toLowerCase().indexOf('nofollow') < 0) ? 'warning' : 'info';

                    links.push(
                        $('<div class="clearfix"></div>').append([
                            crawler.painter.create_status(type, href),
                            '<p>&nbsp;</p>'
                        ])
                    );
                }
            });
        }

        if(links.length > 0){
            crawler.painter.add_row(this.name, [link, links.length, links]);
        }

        return undefined;
    },

    /**
     * Report the images of the page: missing alt attributes (error), fields
     * without any image (warning) and missing title attributes (info), in that
     * order of precedence.
     *
     * @param {string} url
     * @param {jQuery} html
     * @param {string} headers unused
     * @param {Array} field_data
     * @returns {*} whatever crawler.painter.add_row returns
     */
    img_info: function(url, html, headers, field_data){
        const link   = crawler.painter.create_link(url, url),
              imgs   = html.find('img'),
              fields = [];
        let alt = 0, title = 0, status;

        // Count images with a missing or empty alt / title attribute
        $.each(imgs, function(){
            const $this = $(this);
            if(!$this.attr('alt')) alt += 1;
            if(!$this.attr('title')) title += 1;
        });

        // Collect the labels of the fields that contain no image
        for(const f in field_data[2]){
            if($(field_data[2][f]).find('img').length < 1) fields.push(field_data[1][f]);
        }

        // Construct result: only the most severe finding becomes the status
        if(alt > 0){
            status = crawler.painter.create_status('error',
                (alt > 1) ? alt + ' images missing alt tag' : '1 image missing alt tag');
        }else if(fields.length > 0){
            status = crawler.painter.create_status('warning',
                (fields.length > 1) ? fields.join(' and ') + ' are missing images' : fields[0] + ' is missing images');
        }else if(title > 0){
            status = crawler.painter.create_status('info',
                (title > 1) ? title + ' images missing title tag' : '1 image is missing title tag');
        }else{
            status = crawler.painter.create_status('success', 'OK!');
        }

        return crawler.painter.add_row(this.name, [link, imgs.length, alt, title, fields.join(', '), status]);
    },

    /**
     * Report the <title> of the page (expected length 40-56 characters) and
     * record it under 'meta_titles' when exactly one title exists.
     *
     * @param {string} url
     * @param {jQuery} html
     * @returns {*} whatever crawler.painter.add_row returns
     */
    title_info: function(url, html){
        const title  = html.filter('title'),
              link   = crawler.painter.create_link(url, url),
              text   = (title.length === 1) ? title.html() : '',
              status = default_tests.get_meta_tags_status(title, 'meta title', text, 40, 56);

        if(title.length === 1){
            crawler.set_property('meta_titles', text, url);
        }

        return crawler.painter.add_row(this.name, [link, text, text.length, status]);
    },

    /**
     * Report the meta description of the page (expected length 70-156
     * characters) and record it under 'descriptions' when exactly one exists.
     *
     * @param {string} url
     * @param {jQuery} html
     * @returns {*} whatever crawler.painter.add_row returns
     */
    description_info: function(url, html){
        const desc   = html.filter('meta[name=description]'),
              link   = crawler.painter.create_link(url, url),
              // .attr() yields undefined when the tag has no content attribute;
              // treat that like an empty description instead of crashing on .length
              text   = (desc.length === 1) ? (desc.attr('content') || '') : '',
              status = default_tests.get_meta_tags_status(desc, 'meta description', text, 70, 156);

        if(desc.length === 1){
            crawler.set_property('descriptions', text, url);
        }

        return crawler.painter.add_row(this.name, [link, text, text.length, status]);
    },

    /**
     * Check the canonical link of the page. Exactly one canonical is recorded
     * under 'canonicals'; zero or several produce an error row.
     *
     * @param {string} url
     * @param {jQuery} html
     * @returns {undefined}
     */
    canonical_info: function(url, html){
        const tags = default_tests.get_tags(html, 'link', 'rel', 'canonical');

        if(tags.length !== 1){
            const status = crawler.painter.create_status('error', 'Missing / Multiple canonicals found');
            crawler.painter.add_row(this.name, [crawler.painter.create_link(url, url), status]);
        }else{
            crawler.set_property('canonicals', tags[0].attr('href'), url);
        }

        return undefined;
    },

    /**
     * List the page when it carries a meta tag whose content is exactly
     * 'noindex', and switch the table type to 'error'.
     *
     * NOTE(review): combined values such as "noindex, nofollow" are not
     * matched by this exact comparison - confirm whether that is intended.
     *
     * @param {string} url
     * @param {jQuery} html
     * @returns {undefined}
     */
    noindex_pages: function(url, html){
        if(default_tests.get_tags(html, 'meta', 'content', 'noindex').length > 0){
            crawler.painter.add_row(this.name, [crawler.painter.create_link(url, url)]);
            crawler.painter.set_type(this.name, 'error');
        }

        return undefined;
    },

    /**
     * Check the structure of the url: length (max 115), lower case and
     * absence of underscores. Only the first failing rule is reported.
     *
     * @param {string} url
     * @returns {*} undefined when the url is fine, otherwise whatever
     *              crawler.painter.add_row returns
     */
    urls_test: function(url){
        const link = crawler.painter.create_link(url, url);
        let msg;

        if(url.length > 115)                 msg = 'URL is too long';
        else if(url.toLowerCase() !== url)   msg = 'URL is not in lower case';
        else if(url.indexOf('_') >= 0)       msg = 'URL contains under scores';
        else return undefined;

        return crawler.painter.add_row(this.name, [link, crawler.painter.create_status('warning', msg)]);
    },

    /**
     * List the <link> tags of the page that carry an hreflang attribute.
     * A row is only added when at least one such tag exists.
     *
     * @param {string} url
     * @param {jQuery} html
     * @returns {undefined}
     */
    href_langs: function(url, html){
        const link = crawler.painter.create_link(url, url),
              tags = [];

        $.each(html.filter('link'), function(){
            if($(this).attr('hreflang')){
                // Take the tag's outer HTML and escape it so it is displayed as text
                tags.push($('<p>').text($(this).clone().wrap('<p>').parent().html()).html());
            }
        });

        if(tags.length > 0) crawler.painter.add_row(this.name, [link, tags.join('<br />')]);

        return undefined;
    },

    /**
     * Returns the elements of `html` matching selector {tag} whose attribute
     * {key} equals {value}, each wrapped as a jQuery object.
     *
     * @param {jQuery} html
     * @param {string} tag
     * @param {string} key
     * @param {string} value
     * @returns {Array}
     */
    get_tags: function(html, tag, key, value){
        const returns = [];

        $.each(html.filter(tag), function(){
            const $this = $(this),
                  attr  = $this.attr(key);

            if(attr && attr === value){
                returns.push($this);
            }
        });

        return returns;
    },

    /**
     * Returns the first key of `object` whose value contains `search`
     * (via the value's indexOf), or undefined when there is none.
     * Values that are null or have no indexOf method are skipped.
     *
     * @param {*} object
     * @param {*} search
     * @returns {*}
     */
    get_key_from_object: function(object, search){
        for(const key in object){
            const value = object[key];
            if(value != null && typeof value.indexOf === 'function' && value.indexOf(search) >= 0) return key;
        }
        return undefined;
    },

    /**
     * Builds the status box for a meta tag: error when the tag is missing or
     * duplicated, warning when its text length is outside [min_char, max_char].
     *
     * @param {jQuery|Array} tags anything exposing a length
     * @param {string} tag_name
     * @param {string} text
     * @param {int} min_char
     * @param {int} max_char
     * @returns {*} whatever crawler.painter.create_status returns
     */
    get_meta_tags_status: function(tags, tag_name, text, min_char, max_char){
        if(tags.length > 1){
            return crawler.painter.create_status('error', 'Multiple '+tag_name+' tags');
        }
        if(tags.length < 1){
            return crawler.painter.create_status('error', 'Missing '+tag_name+' tag');
        }

        const len = text.length;
        if(len < min_char){
            return crawler.painter.create_status('warning', tag_name+' is too short');
        }
        if(len > max_char){
            return crawler.painter.create_status('warning', tag_name+' is too long');
        }
        return crawler.painter.create_status('success', 'OK!');
    },

    /**
     * Returns the links to every page recorded in crawler.linked_from for the
     * given url, joined with '<br />', or false when the url has no entry.
     *
     * @param {string} url
     * @returns {string|boolean}
     */
    get_linked_from_links: function(url){
        if(!Object.prototype.hasOwnProperty.call(crawler.linked_from, url)){
            return false;
        }

        const sources     = crawler.linked_from[url],
              linked_from = [];

        for(const lf in sources){
            linked_from.push(crawler.painter.create_link(sources[lf], sources[lf]));
        }

        return linked_from.join('<br />');
    }
}
407
408
// Register every entry of default_tests.tests when the crawler fires BEFORE_INIT
crawler.event_handler.on('BEFORE_INIT', function(){
    // forEach instead of for...in: for...in on an array also visits inherited
    // enumerable keys and hands out string indices.
    default_tests.tests.forEach(function(test){
        // Entries without a same-named function on default_tests (e.g. 'bad_links')
        // are registered with `false` as callback; their rows are added by the
        // ALL_CRAWLS_FINISHED handlers instead.
        var func = default_tests.hasOwnProperty( test[0] ) ? default_tests[test[0]] : false;

        // NOTE(review): `regiser_test` is spelled as in the original call; the
        // crawler API is not visible here, so the name is left untouched.
        crawler.regiser_test(test[0], test[1], test[2], func);

        // test[3] is the optional table type; missing or empty falls back to 'default'
        crawler.painter.set_type(test[0], test[3] || 'default');
    });
});
418
419
// Once every crawl has finished, list the tested pages that no other page links to
crawler.event_handler.on('ALL_CRAWLS_FINISHED', function(){
    // True when at least one recorded linker of `page` is a different url
    // (a page linking to itself does not count).
    var has_foreign_linker = function(page){
        if( !crawler.linked_from.hasOwnProperty(page) ) return false;

        for( var s in crawler.linked_from[page] ){
            if( crawler.linked_from[page][s] != page ) return true;
        }

        return false;
    };

    // Start optimistic; flipped to 'error' as soon as one orphan is found
    crawler.painter.set_type('orphan_pages', 'success');

    for( var idx in crawler.tested ){
        var page         = crawler.tested[idx],
            crawl_failed = crawler.failed.indexOf(page) >= 0;

        // Failed pages are skipped here, as are pages with a foreign linker
        if( !crawl_failed && !has_foreign_linker(page) ){
            crawler.painter.add_row('orphan_pages', [crawler.painter.create_link(page, page)]);
            crawler.painter.set_type('orphan_pages', 'error');
        }
    }

    return true;
});
441
442
// Once every crawl has finished, report urls that share a meta title or a meta
// description while being listed under different canonicals
crawler.event_handler.on('ALL_CRAWLS_FINISHED', function(){
    crawler.painter.set_type('duplicate_meta_tags', 'success');

    var canonicals = crawler.canonicals,
        // crawler property to inspect -> message shown for a conflict
        tests      = {
            'meta_titles'   : 'Urls have same meta title but different canonicals',
            'descriptions'  : 'Urls have same meta description but different canonicals'
        };

    for(var test in tests){
        for(var x in crawler[test]){
            var urls = crawler[test][x];

            // A value used by fewer than two urls cannot be a duplicate.
            // (Compare the length: comparing the array itself with 2 never
            // skipped a non-empty list.)
            if( urls.length < 2 ) continue;

            // Canonical under which the first url is listed; every other url
            // sharing the value is expected to be listed under the same one.
            var canonical = default_tests.get_key_from_object(canonicals, urls[0]);
            for( var i in urls ){
                if( canonical != default_tests.get_key_from_object(canonicals, urls[i]) ) {
                    var status = crawler.painter.create_status('error', tests[test]);
                    crawler.painter.add_row('duplicate_meta_tags', [urls.join(', '), status]);
                    break; // one row per duplicated value is enough
                }
            }
        }
    }

    return undefined;
});
468
469
// Once every crawl has finished, list each failed url together with the pages linking to it
crawler.event_handler.on('ALL_CRAWLS_FINISHED', function(){
    // Start optimistic; flipped to 'error' as soon as one bad link is reported
    crawler.painter.set_type('bad_links', 'success');

    for(var idx in crawler.failed){
        var broken_url = crawler.failed[idx],
            sources    = default_tests.get_linked_from_links(broken_url);

        // Nothing to report when no linking pages are returned (false or an empty string)
        if( sources == false ) continue;

        crawler.painter.add_row('bad_links', [broken_url, sources]);
        crawler.painter.set_type('bad_links', 'error');
    }

    return undefined;
});
481
482
// Once every crawl has finished, list each redirecting link, the pages that
// contain it and the target it redirects to
crawler.event_handler.on('ALL_CRAWLS_FINISHED', function(){
    // Start optimistic; downgraded to 'warning' as soon as one redirect is reported
    crawler.painter.set_type('redirect_links', 'success');

    for(var from_url in crawler.redirects){
        var sources = default_tests.get_linked_from_links(from_url);

        // Nothing to report when no linking pages are returned (false or an empty string)
        if( sources == false ) continue;

        crawler.painter.add_row('redirect_links', [from_url, sources, crawler.redirects[from_url]]);
        crawler.painter.set_type('redirect_links', 'warning');
    }

    return undefined;
});
494